library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadÃsticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de VennDescargar los datos. Escalo los datos de la matriz para tener homogeneidad en las representaciones.
# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)
#datos corregidos
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- scale(ROSMAP_RINPMIAGESEX_resids)Descarga del geneset de KEGG
# geneset de Alzheimer extraidos de KEGG
tab <- getGeneKEGGLinks(species="hsa")
tab$Symbol <- mapIds(org.Hs.eg.db, tab$GeneID,
column="SYMBOL", keytype="ENTREZID")
paths <- getKEGGPathwayNames(species="hsa")
geneset_alz <- tab$Symbol[tab$PathwayID=="hsa05010"]Vamos a seleccionar solamente los genes de Alzheimer de KEGG en nuestro dataset de expresión génica.
Visualizamos si se han seleccionado todos los genes y cuantos del geneset de KEGG. Nuestra matriz tiene 305 genes de un total 384 del geneset.
venn.plot <- venn.diagram(
x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
category.names = c("Matrix Genes", "Geneset Alzheimer"),
filename = NULL,
output = FALSE, # Asegura que no se exporte a un archivo
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = 25, #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = "Alzheimer genes in exp matrix",
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)# Scree plot con los datos escalados
var_exp <- pca_alz_genes_scaled$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)
df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)
ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
geom_bar(stat = "identity", fill = "skyblue") +
geom_line(aes(group = 1), color = "blue") +
geom_point(color = "blue") +
theme_minimal() +
labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
ylim(c(0,1)) +
geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))Vemos que con las 20 primeras no encontramos ni siquiera el 80% de la varianza. Las dos primeras PC tienen un 41% de la varizana explicada.
Selecciono los outliers de las dos primeras PC con el doble de SD.
outlierspc1_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]),1])
outlierspc2_scaled <- as.data.frame(pca_alz_genes_scaled$x[abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]),2])df <- as.data.frame(pca_alz_genes_scaled$x)
plot1 <- ggplot(df, aes(x = PC1)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC1) + 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC1) - 2*pca_alz_genes_scaled$sdev[1], linetype = "dashed", color = "blue") +
labs(title = "PC1 density with 2*SD scaled PCA") +
geom_text(data = outlierspc1_scaled,
aes(x = outlierspc1_scaled[,1], y= 0, label = rownames(outlierspc1_scaled)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,1]) > 2*(pca_alz_genes_scaled$sdev[1]), "#8B1A1A", "grey")) +
theme_minimal()
plot2 <- ggplot(df, aes(x = PC2)) +
geom_density(fill = "#00CED1", alpha = 0.5) +
geom_vline(xintercept = mean(df$PC2) + 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
geom_vline(xintercept = mean(df$PC2) - 2*pca_alz_genes_scaled$sdev[2], linetype = "dashed", color = "blue") +
labs(title = "PC2 density with 2*SD scaled PCA") +
geom_text(data = outlierspc2_scaled,
aes(x = outlierspc2_scaled[,1], y= 0, label = rownames(outlierspc2_scaled)),
vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
geom_rug(data = as.data.frame(pca_alz_genes_scaled$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_alz_genes_scaled$x[,2]) > 2*(pca_alz_genes_scaled$sdev[2]), "#8B1A1A", "grey")) +
theme_minimal()
grid.arrange(plot1, plot2, ncol = 1)mPC1_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1_scaled),]
mPC2_scaled <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2_scaled),]
mPC12_scaled <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2_scaled)) & (rownames(mat_exp) %in% rownames(outlierspc1_scaled)),]
not_in_PC12_scaled <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2_scaled)) | (rownames(mat_exp) %in% rownames(outlierspc1_scaled))),]# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1_scaled)), length(rownames(mPC2_scaled)), length(rownames(mPC12_scaled)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1_scaled <- c(rownames(mPC1_scaled), rep("", max_length - length(rownames(mPC1_scaled))))
rownames_mPC2_scaled <- c(rownames(mPC2_scaled), rep("", max_length - length(rownames(mPC2_scaled))))
rownames_mPC12_scaled <- c(rownames(mPC12_scaled), rep("", max_length - length(rownames(mPC12_scaled))))
final_table <- data.frame(
mPC1 = rownames_mPC1_scaled,
mPC2 = rownames_mPC2_scaled,
mPC1_and_PC2 = rownames_mPC12_scaled
)
final_tableVoy ahora a ver de esas muestras seleccionadas, qué covariables están asociadas con esas muestras outliers
rownames(covs) <- covs$mrna_id
covs2 <- covs
for (i in rownames(covs2)){
if (i %in% rownames(mPC1_scaled) & !(i %in% rownames(mPC12_scaled))){
covs2[i, "sampleset_PCA"] <- "mPC1"
}
if (i %in% rownames(mPC2_scaled) & !(i %in% rownames(mPC12_scaled))){
covs2[i, "sampleset_PCA"] <- "mPC2"
}
if (i %in% rownames(mPC12_scaled)){
covs2[i, "sampleset_PCA"] <- "mPC1 and mPC2"
}
if (!(i %in% rownames(mPC1_scaled)) & !(i %in% rownames(mPC2_scaled))){
covs2[i, "sampleset_PCA"] <- "Not in both"
}
}
covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))pca.data.kegg <- data.frame(pca_alz_genes_scaled$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")
covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
filt <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_PCA) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_PCA == "mPC1" | sampleset_PCA == "mPC2" | sampleset_PCA == "mPC1 and mPC2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
group_by(across(all_of(col_name))) %>%
mutate(suma.total = sum(percentage)) %>%
ungroup()
p <- ggplot(filt, aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA")) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=paste0(round(suma.total, 1), " %"), y = suma.total), label.size = 10, color = "black") +
labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
theme_minimal()
if (length(unique(covs2[[i]])) >= 2) {
p <- p +
geom_segment(aes(x = 1, xend = 1, y = 0, yend = suma.total[1]-1), color = "red", size = 3)
p <- p + geom_segment(aes(x = 2, xend = 2, y = 0, yend = suma.total[3]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 3){
p <- p + geom_segment(aes(x = 3, xend = 3, y = 0, yend = suma.total[5]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 4) {
p <- p + geom_segment(aes(x = 4, xend = 4, y = 0, yend = suma.total[7]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 5) {
p <- p + geom_segment(aes(x = 5, xend = 5, y = 0, yend = suma.total[9]-1), color = "red", size = 3)
}
}
}
}
}
plots[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)plots.pca.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1_scaled) | mrna_id %in% rownames(outlierspc2_scaled), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 10, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "PC1.KEGG", y = "PC2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.pca.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.pca.kegg$study, plots.pca.kegg$braaksc, ncol=1)set.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)
tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_idrownames(tsne.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 2 * sd.tsne2, ]mtSNE1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1),]
mtSNE2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2),]
mtSNE12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1)) & (rownames(mat_exp) %in% rownames(outliers.tsne2)),]
not_in_tSNE12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1)) | (rownames(mat_exp) %in% rownames(outliers.tsne2))),]for (i in rownames(covs2)){
if (i %in% rownames(mtSNE1) & !(i %in% rownames(mtSNE12))){
covs2[i, "sampleset_tSNE"] <- "mtSNE1"
}
if (i %in% rownames(mtSNE2) & !(i %in% rownames(mtSNE12))){
covs2[i, "sampleset_tSNE"] <- "mtSNE2"
}
if (i %in% rownames(mtSNE12)){
covs2[i, "sampleset_tSNE"] <- "mtSNE1 and mtSNE2"
}
if (!(i %in% rownames(mtSNE1)) & !(i %in% rownames(mtSNE2))){
covs2[i, "sampleset_tSNE"] <- "Not in both"
}
}
covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1)), length(rownames(mtSNE2)), length(rownames(mtSNE12)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1 <- c(rownames(mtSNE1), rep("", max_length - length(rownames(mtSNE1))))
rownames_tsne2 <- c(rownames(mtSNE2), rep("", max_length - length(rownames(mtSNE2))))
rownames_tsne12 <- c(rownames(mtSNE12), rep("", max_length - length(rownames(mtSNE12))))
final_table <- data.frame(
mtSNE1 = rownames_tsne1,
mtSNE2 = rownames_tsne2,
mtSNE1_and_mtSNE2 = rownames_tsne12
)
final_tablecolnames(tsne.data) <- c("tSNE1.KEGG", "tSNE2.KEGG")
covs2 <- merge(covs2, tsne.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_tSNE) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_tSNE == "mtSNE1" | sampleset_tSNE == "mtSNE2" | sampleset_tSNE == "mtSNE1 and mtSNE2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)plots.tsne.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.tsne1) | mrna_id %in% rownames(outliers.tsne2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "tSNE1.KEGG", y = "tSNE2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.tsne.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.tsne.kegg$study, plots.tsne.kegg$braaksc, ncol=1)local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)
umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs2, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_idrownames(umap.data) <- rownames(mat_exp_alz_genes)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 2 * sd.umap2, ]mUMAP1 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1),]
mUMAP2 <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2),]
mUMAP12 <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1)) & (rownames(mat_exp) %in% rownames(outliers.umap2)),]
not_in_UMAP12 <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1)) | (rownames(mat_exp) %in% rownames(outliers.umap2))),]for (i in rownames(covs2)){
if (i %in% rownames(mUMAP1) & !(i %in% rownames(mUMAP12))){
covs2[i, "sampleset_UMAP"] <- "mUMAP1"
}
if (i %in% rownames(mUMAP2) & !(i %in% rownames(mUMAP12))){
covs2[i, "sampleset_UMAP"] <- "mUMAP2"
}
if (i %in% rownames(mUMAP12)){
covs2[i, "sampleset_UMAP"] <- "mUMAP1 and mUMAP2"
}
if (!(i %in% rownames(mUMAP1)) & !(i %in% rownames(mUMAP2))){
covs2[i, "sampleset_UMAP"] <- "Not in both"
}
}
covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1)), length(rownames(mUMAP2)), length(rownames(mUMAP12)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1 <- c(rownames(mUMAP1), rep("", max_length - length(rownames(mUMAP1))))
rownames_umap2 <- c(rownames(mUMAP2), rep("", max_length - length(rownames(mUMAP2))))
rownames_umap12 <- c(rownames(mUMAP12), rep("", max_length - length(rownames(mUMAP12))))
final_table <- data.frame(
mUMAP1 = rownames_umap1,
mUMAP2 = rownames_umap2,
mUMAP1_and_mUMAP2 = rownames_umap12
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_UMAP) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_UMAP == "mUMAP1" | sampleset_UMAP == "mUMAP2" | sampleset_UMAP == "mUMAP1 and mUMAP2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(umap.data) <- c("UMAP1.KEGG", "UMAP2.KEGG")
covs2 <- merge(covs2, umap.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.umap.kegg <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "UMAP1.KEGG", y = "UMAP2.KEGG", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.umap.kegg[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.umap.kegg$study, plots.umap.kegg$braaksc, ncol=1)geneset.AD.Andrews <- read_delim("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/geneset AD Andrews 2023 Lancet.csv", delim = ";", escape_double = FALSE, trim_ws = TRUE)
geneset.AD.Andrews <- unique(geneset.AD.Andrews$Gene)venn.plot <- venn.diagram(
x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset.AD.Andrews),
category.names = c("Matrix Genes", "Geneset Alzheimer"),
filename = NULL,
output = FALSE, # Asegura que no se exporte a un archivo
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = 25, #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = "Alzheimer genes in exp matrix Andrews",
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)Seleccionamos muestras que esten 2 veces la desviacion tipica para la PC1 y PC2
outlierspc1.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,1]) > 2*(pca.AD.Andrews$sdev[1]),1])
outlierspc2.Andrews <- as.data.frame(pca.AD.Andrews$x[abs(pca.AD.Andrews$x[,2]) > 2*(pca.AD.Andrews$sdev[2]),2])Las ploteamos
Creamos los grupos de muestras seleccionados de los outliers de PC1 y/o PC2
mPC1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc1.Andrews),]
mPC2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outlierspc2.Andrews),]
mPC12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) & (rownames(mat_exp) %in% rownames(outlierspc1.Andrews)),]
not_in_PC12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outlierspc2.Andrews)) | (rownames(mat_exp) %in% rownames(outlierspc1.Andrews))),]rownames(covs2) <- covs2$mrna_id
for (i in rownames(covs2)){
if (i %in% rownames(mPC1.Andrews) & !(i %in% rownames(mPC12.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC1"
}
if (i %in% rownames(mPC2.Andrews) & !(i %in% rownames(mPC12.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC2"
}
if (i %in% rownames(mPC12.Andrews)){
covs2[i, "sampleset_PCA_Andrews"] <- "mPC1 and mPC2"
}
if (!(i %in% rownames(mPC1.Andrews)) & !(i %in% rownames(mPC2.Andrews))){
covs2[i, "sampleset_PCA_Andrews"] <- "Not in both"
}
}
covs2$sampleset_PCA_Andrews <- as.factor(covs2$sampleset_PCA_Andrews)
data.frame(table(covs2$sampleset_PCA_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mPC1.Andrews)), length(rownames(mPC2.Andrews)), length(rownames(mPC12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.Andrews <- c(rownames(mPC1.Andrews), rep("", max_length - length(rownames(mPC1.Andrews))))
rownames_mPC2.Andrews <- c(rownames(mPC2.Andrews), rep("", max_length - length(rownames(mPC2.Andrews))))
rownames_mPC12.Andrews <- c(rownames(mPC12.Andrews), rep("", max_length - length(rownames(mPC12.Andrews))))
final_table <- data.frame(
mPC1 = rownames_mPC1.Andrews,
mPC2 = rownames_mPC2.Andrews,
mPC1_and_PC2 = rownames_mPC12.Andrews
)
final_tableMetemos una nueva variable en las covariables dependiendo de si son outliers de PC1 o del PC2 o de ninguno.
plots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
filt <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_PCA_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_PCA_Andrews == "mPC1" | sampleset_PCA_Andrews == "mPC2" | sampleset_PCA_Andrews == "mPC1 and mPC2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
group_by(across(all_of(col_name))) %>%
mutate(suma.total = sum(percentage)) %>%
ungroup()
p <- ggplot(filt, aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_PCA_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
geom_text(aes(label=paste0(round(suma.total, 1), " %"), y = suma.total), label.size = 10, color = "black") +
labs(x = names(covs2)[i], y = "%", fill = "Sample Set PCA") +
theme_minimal()
if (length(unique(covs2[[i]])) >= 2) {
p <- p +
geom_segment(aes(x = 1, xend = 1, y = 0, yend = suma.total[1]-1), color = "red", size = 3)
p <- p + geom_segment(aes(x = 2, xend = 2, y = 0, yend = suma.total[3]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 3){
p <- p + geom_segment(aes(x = 3, xend = 3, y = 0, yend = suma.total[5]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 4) {
p <- p + geom_segment(aes(x = 4, xend = 4, y = 0, yend = suma.total[7]-1), color = "red", size = 3)
if (length(unique(covs2[[i]])) >= 5) {
p <- p + geom_segment(aes(x = 5, xend = 5, y = 0, yend = suma.total[9]-1), color = "red", size = 3)
}
}
}
}
}
plots[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)pca.data.Andrews <- data.frame(pca.AD.Andrews$x)[,1:2]
colnames(pca.data.Andrews) <- c("PCA1.Andrews", "PCA2.Andrews")
covs2 <- merge(covs2, pca.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.pca.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1.Andrews) | mrna_id %in% rownames(outlierspc2.Andrews), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 10, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "PCA1.Andrews", y = "PCA2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.pca.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.pca.Andrews$study, plots.pca.Andrews$braaksc, ncol=1)set.seed(1234)
tsne.Andrews <- Rtsne(mat.expre.AD.Andrews, dims = 2, theta = 0.0)
tsne.data.Andrews <- as.data.frame(tsne.Andrews$Y)
row.names(tsne.data.Andrews) <- row.names(mat.expre.AD.Andrews)
tsne.Andrews.covs <- merge(tsne.data.Andrews, covs, by = "row.names")
tsne.Andrews.covs$Row.names <- NULL
row.names(tsne.Andrews.covs) <- tsne.Andrews.covs$mrna_idrownames(tsne.data.Andrews) <- rownames(mat.expre.AD.Andrews)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data.Andrews[,1])
sd.tsne1 <- sd(tsne.data.Andrews[,1])
mean.tsne2 <- mean(tsne.data.Andrews[,2])
sd.tsne2 <- sd(tsne.data.Andrews[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1.Andrews <- tsne.data.Andrews[abs(tsne.data.Andrews[,1] - mean.tsne1) > 2 * sd.tsne1, ]
outliers.tsne2.Andrews <-tsne.data.Andrews[abs(tsne.data.Andrews[,2] - mean.tsne2) > 2 * sd.tsne2, ]mtSNE1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews),]
mtSNE2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews),]
mtSNE12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews)),]
not_in_tSNE12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.tsne1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.tsne2.Andrews))),]for (i in rownames(covs2)){
if (i %in% rownames(mtSNE1.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1"
}
if (i %in% rownames(mtSNE2.Andrews) & !(i %in% rownames(mtSNE12.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE2"
}
if (i %in% rownames(mtSNE12.Andrews)){
covs2[i, "sampleset_tSNE_Andrews"] <- "mtSNE1 and mtSNE2"
}
if (!(i %in% rownames(mtSNE1.Andrews)) & !(i %in% rownames(mtSNE2.Andrews))){
covs2[i, "sampleset_tSNE_Andrews"] <- "Not in both"
}
}
covs2$sampleset_tSNE_Andrews <- as.factor(covs2$sampleset_tSNE_Andrews)
data.frame(table(covs2$sampleset_tSNE_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mtSNE1.Andrews)), length(rownames(mtSNE2.Andrews)), length(rownames(mtSNE12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_tsne1.Andrews <- c(rownames(mtSNE1.Andrews), rep("", max_length - length(rownames(mtSNE1.Andrews))))
rownames_tsne2.Andrews <- c(rownames(mtSNE2.Andrews), rep("", max_length - length(rownames(mtSNE2.Andrews))))
rownames_tsne12.Andrews <- c(rownames(mtSNE12.Andrews), rep("", max_length - length(rownames(mtSNE12.Andrews))))
final_table <- data.frame(
mtSNE1 = rownames_tsne1.Andrews,
mtSNE2 = rownames_tsne2.Andrews,
mtSNE1_and_mtSNE2 = rownames_tsne12.Andrews
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric" ){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_tSNE_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_tSNE_Andrews == "mtSNE1" | sampleset_tSNE_Andrews == "mtSNE2" | sampleset_tSNE_Andrews == "mtSNE1 and mtSNE2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_tSNE_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = names(covs2)[i], y = "Percentage", fill = "Outliers") +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc, plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(tsne.data.Andrews) <- c("tSNE1.Andrews", "tSNE2.Andrews")
covs2 <- merge(covs2, tsne.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.tsne.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.tsne1.Andrews) | covs2$mrna_id %in% rownames(outliers.tsne2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30, # Reduce el número máximo de solapamientos
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "tSNE1.Andrews", y = "tSNE2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.tsne.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.tsne.Andrews$study, plots.tsne.Andrews$braaksc, ncol=1)local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad.Andrews <- umap(mat.expre.AD.Andrews,random_stage=1234, local.config)
umap.data.Andrews <- as.data.frame(umap.ad.Andrews$layout)
row.names(umap.data.Andrews) <- row.names(mat.expre.AD.Andrews)
umap.data.covs.Andrews <- merge(umap.data.Andrews, covs2, by = "row.names")
umap.data.covs.Andrews$Row.names <- NULL
row.names(umap.data.covs.Andrews) <- umap.data.covs.Andrews$mrna_idrownames(umap.data.Andrews) <- rownames(mat.expre.AD.Andrews)
# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data.Andrews[,1])
sd.umap1 <- sd(umap.data.Andrews[,1])
mean.umap2 <- mean(umap.data.Andrews[,2])
sd.umap2 <- sd(umap.data.Andrews[,2])
# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1.Andrews <- umap.data.Andrews[abs(umap.data.Andrews[,1] - mean.umap1) > 2 * sd.umap1, ]
outliers.umap2.Andrews <-umap.data.Andrews[abs(umap.data.Andrews[,2] - mean.umap2) > 2 * sd.umap2, ]mUMAP1.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap1.Andrews),]
mUMAP2.Andrews <- mat_exp[rownames(mat_exp) %in% rownames(outliers.umap2.Andrews),]
mUMAP12.Andrews <- mat_exp[(rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) & (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews)),]
not_in_UMAP12.Andrews <- mat_exp[!((rownames(mat_exp) %in% rownames(outliers.umap1.Andrews)) | (rownames(mat_exp) %in% rownames(outliers.umap2.Andrews))),]for (i in rownames(covs2)){
if (i %in% rownames(mUMAP1.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1"
}
if (i %in% rownames(mUMAP2.Andrews) & !(i %in% rownames(mUMAP12.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP2"
}
if (i %in% rownames(mUMAP12.Andrews)){
covs2[i, "sampleset_UMAP_Andrews"] <- "mUMAP1 and mUMAP2"
}
if (!(i %in% rownames(mUMAP1.Andrews)) & !(i %in% rownames(mUMAP2.Andrews))){
covs2[i, "sampleset_UMAP_Andrews"] <- "Not in both"
}
}
covs2$sampleset_UMAP_Andrews <- as.factor(covs2$sampleset_UMAP_Andrews)
data.frame(table(covs2$sampleset_UMAP_Andrews))# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(rownames(mUMAP1.Andrews)), length(rownames(mUMAP2.Andrews)), length(rownames(mUMAP12.Andrews)))
# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_umap1.Andrews <- c(rownames(mUMAP1.Andrews), rep("", max_length - length(rownames(mUMAP1.Andrews))))
rownames_umap2.Andrews <- c(rownames(mUMAP2.Andrews), rep("", max_length - length(rownames(mUMAP2.Andrews))))
rownames_umap12.Andrews <- c(rownames(mUMAP12.Andrews), rep("", max_length - length(rownames(mUMAP12.Andrews))))
final_table <- data.frame(
mUMAP1 = rownames_umap1.Andrews,
mUMAP2 = rownames_umap2.Andrews,
mUMAP1_and_mUMAP2 = rownames_umap12.Andrews
)
final_tableplots <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 1:ncol(covs2)) {
col_name <- colnames(covs2)[i]
if (class(covs2[[i]]) == "character" | class(covs2[[i]]) == "numeric"){
next
}
if (class(covs2[[i]]) == "factor"){
p <- covs2 %>%
group_by(across(all_of(col_name)), sampleset_UMAP_Andrews) %>%
summarise(count = n(), .groups = 'drop') %>%
filter(sampleset_UMAP_Andrews == "mUMAP1" | sampleset_UMAP_Andrews == "mUMAP2" | sampleset_UMAP_Andrews == "mUMAP1 and mUMAP2") %>%
mutate(total = sum(count), percentage = (count / total) * 100) %>%
ggplot(aes_string(x = names(covs2)[i], y = "percentage", fill = "sampleset_UMAP_Andrews")) +
geom_bar(stat="identity", position=position_dodge()) +
labs(x = NULL, y = "%", fill = col_name) +
theme_minimal()
plots[[colnames(covs2)[i]]] <- p
}
}
grid.arrange(plots$study , plots$braaksc,plots$ceradsc, plots$cogdx, plots$apoe_genotype, plots$neuroStatus, ncol=2)colnames(umap.data.Andrews) <- c("UMAP1.Andrews", "UMAP2.Andrews")
covs2 <- merge(covs2, umap.data.Andrews, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULLplots.umap.Andrews <- list()
# Itera sobre las columnas de covariables y añadir a la lista
for (i in 3:13) {
# Crea una gráfica para cada variable
p <- ggplot(covs2, aes_string(x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i])) +
geom_point(show.legend = TRUE) +
geom_text_repel(aes(label=ifelse((covs2$mrna_id %in% rownames(outliers.umap1.Andrews) | covs2$mrna_id %in% rownames(outliers.umap2.Andrews)), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
color = "black",
max.overlaps = 30,
point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
size = 3,
fontface = "bold",
segment.size = 0.2, # LÃneas de guÃa más finas
segment.color = 'grey50',
max.segment.length = unit(0.5, "lines"), # LÃneas de guÃa más cortas
arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
theme_minimal() +
labs(title = names(covs2)[i],
x = "UMAP1.Andrews", y = "UMAP2.Andrews", color = names(covs2)[i]) +
theme(
legend.title = element_blank(),
legend.text = element_text(size = 12),
text = element_text(size = 12),
legend.key.size = unit(0.5, "cm"),
plot.margin = margin(5, 5, 5, 5)# Agrega un margen alrededor de la gráfica si es necesario
)
plots.umap.Andrews[[colnames(covs2)[i]]] <- p
}
grid.arrange(plots.umap.Andrews$study, plots.umap.Andrews$braaksc, ncol=1)grid.arrange(plots.pca.kegg$neuroStatus, plots.pca.Andrews$neuroStatus,plots.tsne.kegg$neuroStatus, plots.tsne.Andrews$neuroStatus, plots.umap.kegg$neuroStatus, plots.umap.Andrews$neuroStatus,ncol = 2)all.outliers <- list(PCA.KEGG = c(rownames(outlierspc1_scaled), rownames(outlierspc2_scaled)),
tSNE.KEGG = c(rownames(outliers.tsne1), rownames(outliers.tsne2)),
UMAP.KEGG = c(rownames(outliers.umap1), rownames(outliers.umap2)),
PCA.Andrews = c(rownames(outlierspc1.Andrews), rownames(outlierspc2.Andrews)),
tSNE.Andrews = c(rownames(outliers.tsne1.Andrews), rownames(outliers.tsne2.Andrews)),
UMAP.Andrews = c(rownames(outliers.umap1.Andrews), rownames(outliers.umap2.Andrews))
)
max_length <- max(sapply(all.outliers, length))
# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
length(x) <- max_length # Esto rellenará con NA si la lista es más corta que max_length
x
}
# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)
# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)
all.outliers# Convertimos el data frame a un vector para hacer la tabla
all_values.outliers <- unlist(all.outliers, use.names = FALSE)
# Usamos table() para contar las frecuencias de cada muestra
table_values <- table(all_values.outliers)
# Filtramos para mostrar solo las muestras que se repiten
repeated_samples <- names(table_values[table_values > 1])
# Unlist with names to track the original column of each value
all_values_named <- unlist(all.outliers, use.names = TRUE)
# Extract unique non-NA samples
unique_samples <- unique(all_values_named[!is.na(all_values_named)])
# Create an empty matrix to fill with presence (1) or absence (0)
incidence_matrix <- matrix(0, nrow = length(unique_samples), ncol = length(all.outliers))
rownames(incidence_matrix) <- unique_samples
colnames(incidence_matrix) <- names(all.outliers)
# Populate the incidence matrix
for (col in names(all.outliers)) {
present_samples <- all.outliers[[col]][!is.na(all.outliers[[col]])] # Extract non-NA samples from column
incidence_matrix[rownames(incidence_matrix) %in% present_samples, col] <- 1
}
# Melt the incidence matrix for ggplot
melted_incidence_matrix <- melt(incidence_matrix, varnames = c("sample", "column"), value.name = "presence")
colnames(melted_incidence_matrix)[3] <- "presence"
# Convertir la matriz de incidencia en un data frame
incidence_df <- as.data.frame(incidence_matrix)
# Crear un data frame para contar y listar las columnas de aparición
summary_df <- data.frame(
sample = rownames(incidence_df),
count = rowSums(incidence_df),
columns = apply(incidence_df, 1, function(row) paste(names(df)[as.logical(row)], collapse = ", "))
)
# Ordenar el summary_df para mejorar la visualización
summary_df <- summary_df %>%
arrange(desc(count)) %>%
mutate(sample = factor(sample, levels = sample))
# Crear el gráfico de barras
ggplot(summary_df, aes(x = sample, y = count, fill = count)) + # Mostrar solo las primeras 20 muestras para claridad
geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
coord_flip() + # Invertir el gráfico para mejor visualización de etiquetas
labs(x = "Sample", y = "Count of Appearances", title = "Sample Appearances Across dimensions") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
axis.title.x = element_text(size = 12, face = "bold"),
axis.title.y = element_text(size = 12, face = "bold"),
plot.title = element_text(size = 14, face = "bold", hjust = 0.5))# Calcular el número total de presencias para cada muestra
total_presences <- rowSums(incidence_matrix)
melted_incidence_matrix$total_presence <- total_presences[as.character(melted_incidence_matrix$sample)]
melted_incidence_matrix$column <- gsub(pattern = "outliers.", replacement = "", x = melted_incidence_matrix$column)
melted_incidence_matrix$sample <- gsub(pattern = "_.*", replacement = "", x = melted_incidence_matrix$sample)
filtered_data <- melted_incidence_matrix %>%
filter(presence == 1) %>%
arrange(desc(total_presence)) %>%
mutate(sample = factor(sample, levels = unique(sample)))
ggplot(filtered_data, aes(x = column, y = sample, size = total_presence)) +
geom_point(color = "steelblue") +
scale_size(range = c(2, 6), guide = "none") + # Adjustable size scale with no size legend
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right") +
labs(y = "Sample", x= "Dimension", title = "Presence of Samples Across Dimensions", size = "Total Presence")venn.plot <- venn.diagram(
x = lista.outliers[1:3],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0, 0), #posicion de las categoricas
cat.dist = 0.1, #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn KEGG geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"),
x = 0.5,
y = 0.3,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"),
x = 0.68,
y = 0.33,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.365,
y = 0.25,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[4:6],
category.names = c("PCA", "tSNE", "UMAP"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#21908dff', '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1, -0.05), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn Andrews geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 1)
)
interseccion_ABC <- Reduce(intersect, lista.outliers[4:6])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_ABC), collapse = "\n"),
x = 0.58,
y = 0.35,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AC <- Reduce(intersect, lista.outliers[c(4,6)])
grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.42,
y = 0.35,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_BC <- Reduce(intersect, lista.outliers[5:6])
grid.text(label = paste(gsub("_.*","",interseccion_BC[!(interseccion_BC %in% interseccion_ABC)]), collapse = "\n"),
x = 0.68,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))
interseccion_AB <- Reduce(intersect, lista.outliers[4:5])
grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"),
x = 0.56,
y = 0.7,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(1,4)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn PCA geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(1,4)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.58,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(2,5)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn tSNE geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(2,5)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.63,
y = 0.4,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))venn.plot <- venn.diagram(
x = lista.outliers[c(3,6)],
category.names = c("KEGG", "Andrews"),
filename = NULL,
output = FALSE,
fill = c("#440154ff", '#fde725ff'),
cex = 1, # Aumenta el tamaño del texto
fontface = "bold",
cat.cex = 1, # Aumenta el tamaño del texto de las categorÃas
cat.fontface = "bold",
cat.default.pos = "text",
cat.pos = c(0, 0), #posicion de las categoricas
cat.dist = c(0.1, 0.1), #distancia de las categoricas
rotation.degree = 0,
margin = 0.1, # hacerla más pequeña
lwd=0.5,
lty = "dashed", # Estilo de lÃnea discontinua
edge.col = "grey", # Color de los bordes
main = paste0("Diagrama Venn UMAP geneset"),
main.fontface= "bold",
main.cex = 2,
main.pos = c(0.5, 0.9)
)
interseccion_AB <- Reduce(intersect, lista.outliers[c(3,6)])
grid.newpage() # Asegura que el lienzo esté limpio
grid.draw(venn.plot)
grid.text(label = paste(gsub("_.*","",interseccion_AB), collapse = "\n"),
x = 0.52,
y = 0.3,
gp = gpar(fontsize = 8,
col = "black",
fontface = "italic"))